The Thera bank recently saw a steep decline in the number of users of their credit card. Credit cards are a good source of income for banks because of the different kinds of fees they charge, such as annual fees, balance transfer fees, cash advance fees, late payment fees, foreign transaction fees, and others. Some fees are charged to every user irrespective of usage, while others are charged under specified circumstances.
Customers leaving the credit card service would lead the bank to losses, so the bank wants to analyze customer data to identify the customers who will leave the service and the reasons for doing so — so that the bank can improve in those areas.
You as a Data scientist at Thera bank need to come up with a classification model that will help the bank improve its services so that customers do not renounce their credit cards
You need to identify the best possible model that will give the required performance
# To help with reading and manipulating data
import pandas as pd
import numpy as np
# To help with data visualization
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
# To be used for missing value imputation
from sklearn.impute import SimpleImputer
# To help with model building
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import (
AdaBoostClassifier,
GradientBoostingClassifier,
RandomForestClassifier,
BaggingClassifier,
)
from xgboost import XGBClassifier
# To get different metric scores, and split data
from sklearn import metrics
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.metrics import (
f1_score,
accuracy_score,
recall_score,
precision_score,
confusion_matrix,
roc_auc_score,
plot_confusion_matrix,
)
# To impute missing values
from sklearn.impute import KNNImputer
# To oversample and undersample data
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
# To be used for data scaling and one hot encoding
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder
# To be used for tuning the model
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
# To be used for creating pipelines and personalizing them
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
# To define maximum number of columns to be displayed in a dataframe
pd.set_option("display.max_columns", None)
# To suppress scientific notation in dataframes
pd.set_option("display.float_format", lambda x: "%.3f" % x)
# To suppress warnings
import warnings
warnings.filterwarnings("ignore")
# This will help in making the Python code more structured automatically (good coding practice)
%load_ext nb_black
# Read the dataset into a pandas dataframe
Bank = pd.read_csv("BankChurners.csv")
# Work on a copy so the original data stays untouched
df = Bank.copy()
df.head()
| CLIENTNUM | Attrition_Flag | Customer_Age | Gender | Dependent_count | Education_Level | Marital_Status | Income_Category | Card_Category | Months_on_book | Total_Relationship_Count | Months_Inactive_12_mon | Contacts_Count_12_mon | Credit_Limit | Total_Revolving_Bal | Avg_Open_To_Buy | Total_Amt_Chng_Q4_Q1 | Total_Trans_Amt | Total_Trans_Ct | Total_Ct_Chng_Q4_Q1 | Avg_Utilization_Ratio | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 768805383 | Existing Customer | 45 | M | 3 | High School | Married | $60K - $80K | Blue | 39 | 5 | 1 | 3 | 12691.000 | 777 | 11914.000 | 1.335 | 1144 | 42 | 1.625 | 0.061 |
| 1 | 818770008 | Existing Customer | 49 | F | 5 | Graduate | Single | Less than $40K | Blue | 44 | 6 | 1 | 2 | 8256.000 | 864 | 7392.000 | 1.541 | 1291 | 33 | 3.714 | 0.105 |
| 2 | 713982108 | Existing Customer | 51 | M | 3 | Graduate | Married | $80K - $120K | Blue | 36 | 4 | 1 | 0 | 3418.000 | 0 | 3418.000 | 2.594 | 1887 | 20 | 2.333 | 0.000 |
| 3 | 769911858 | Existing Customer | 40 | F | 4 | High School | NaN | Less than $40K | Blue | 34 | 3 | 4 | 1 | 3313.000 | 2517 | 796.000 | 1.405 | 1171 | 20 | 2.333 | 0.760 |
| 4 | 709106358 | Existing Customer | 40 | M | 3 | Uneducated | Married | $60K - $80K | Blue | 21 | 5 | 1 | 0 | 4716.000 | 0 | 4716.000 | 2.175 | 816 | 28 | 2.500 | 0.000 |
# Understanding the shape of the data
rows, cols = df.shape
print(f"There are \033[1;4m{rows}\033[m rows and \033[1;4m{cols}\033[m columns.")
There are 10127 rows and 21 columns.
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 10127 entries, 0 to 10126 Data columns (total 21 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 CLIENTNUM 10127 non-null int64 1 Attrition_Flag 10127 non-null object 2 Customer_Age 10127 non-null int64 3 Gender 10127 non-null object 4 Dependent_count 10127 non-null int64 5 Education_Level 8608 non-null object 6 Marital_Status 9378 non-null object 7 Income_Category 10127 non-null object 8 Card_Category 10127 non-null object 9 Months_on_book 10127 non-null int64 10 Total_Relationship_Count 10127 non-null int64 11 Months_Inactive_12_mon 10127 non-null int64 12 Contacts_Count_12_mon 10127 non-null int64 13 Credit_Limit 10127 non-null float64 14 Total_Revolving_Bal 10127 non-null int64 15 Avg_Open_To_Buy 10127 non-null float64 16 Total_Amt_Chng_Q4_Q1 10127 non-null float64 17 Total_Trans_Amt 10127 non-null int64 18 Total_Trans_Ct 10127 non-null int64 19 Total_Ct_Chng_Q4_Q1 10127 non-null float64 20 Avg_Utilization_Ratio 10127 non-null float64 dtypes: float64(5), int64(10), object(6) memory usage: 1.6+ MB
# Encode the target: "Existing Customer" -> 0, "Attrited Customer" -> 1.
# Bug fix: the original lambda returned an int (0) for one branch and a
# string ("1") for the other, producing a mixed-type column that only worked
# because of the trailing astype; mapping straight to integers avoids that.
df.Attrition_Flag = df.Attrition_Flag.apply(
    lambda x: 0 if x == "Existing Customer" else 1
).astype("int64")
# let's check for duplicate values in the data
df.duplicated().sum()
0
# Percentage of missing values per column, sorted with the worst offenders first
missing_pct = (df.isna().sum() / len(df) * 100).round(2)
pd.DataFrame({"% of Missing Values": missing_pct.sort_values(ascending=False)})
| % of Missing Values | |
|---|---|
| Education_Level | 15.000 |
| Marital_Status | 7.400 |
| Avg_Utilization_Ratio | 0.000 |
| Months_on_book | 0.000 |
| Attrition_Flag | 0.000 |
| Customer_Age | 0.000 |
| Gender | 0.000 |
| Dependent_count | 0.000 |
| Income_Category | 0.000 |
| Card_Category | 0.000 |
| Total_Relationship_Count | 0.000 |
| Total_Ct_Chng_Q4_Q1 | 0.000 |
| Months_Inactive_12_mon | 0.000 |
| Contacts_Count_12_mon | 0.000 |
| Credit_Limit | 0.000 |
| Total_Revolving_Bal | 0.000 |
| Avg_Open_To_Buy | 0.000 |
| Total_Amt_Chng_Q4_Q1 | 0.000 |
| Total_Trans_Amt | 0.000 |
| Total_Trans_Ct | 0.000 |
| CLIENTNUM | 0.000 |
The Education_Level column has 15% missing values and the Marital_Status column has 7.4% missing values out of the total observations.
# Checking the number of unique values in each column
df.nunique()
CLIENTNUM 10127 Attrition_Flag 2 Customer_Age 45 Gender 2 Dependent_count 6 Education_Level 6 Marital_Status 3 Income_Category 6 Card_Category 4 Months_on_book 44 Total_Relationship_Count 6 Months_Inactive_12_mon 7 Contacts_Count_12_mon 7 Credit_Limit 6205 Total_Revolving_Bal 1974 Avg_Open_To_Buy 6813 Total_Amt_Chng_Q4_Q1 1158 Total_Trans_Amt 5033 Total_Trans_Ct 126 Total_Ct_Chng_Q4_Q1 830 Avg_Utilization_Ratio 964 dtype: int64
We can drop CLIENTNUM, as it is unique for each customer and will not add value to the model.
# Dropping CLIENTNUM column
# Drop the identifier column (unique per row) and summarize the numeric columns
df.drop(columns="CLIENTNUM", inplace=True)
df.describe().T
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| Attrition_Flag | 10127.000 | 0.161 | 0.367 | 0.000 | 0.000 | 0.000 | 0.000 | 1.000 |
| Customer_Age | 10127.000 | 46.326 | 8.017 | 26.000 | 41.000 | 46.000 | 52.000 | 73.000 |
| Dependent_count | 10127.000 | 2.346 | 1.299 | 0.000 | 1.000 | 2.000 | 3.000 | 5.000 |
| Months_on_book | 10127.000 | 35.928 | 7.986 | 13.000 | 31.000 | 36.000 | 40.000 | 56.000 |
| Total_Relationship_Count | 10127.000 | 3.813 | 1.554 | 1.000 | 3.000 | 4.000 | 5.000 | 6.000 |
| Months_Inactive_12_mon | 10127.000 | 2.341 | 1.011 | 0.000 | 2.000 | 2.000 | 3.000 | 6.000 |
| Contacts_Count_12_mon | 10127.000 | 2.455 | 1.106 | 0.000 | 2.000 | 2.000 | 3.000 | 6.000 |
| Credit_Limit | 10127.000 | 8631.954 | 9088.777 | 1438.300 | 2555.000 | 4549.000 | 11067.500 | 34516.000 |
| Total_Revolving_Bal | 10127.000 | 1162.814 | 814.987 | 0.000 | 359.000 | 1276.000 | 1784.000 | 2517.000 |
| Avg_Open_To_Buy | 10127.000 | 7469.140 | 9090.685 | 3.000 | 1324.500 | 3474.000 | 9859.000 | 34516.000 |
| Total_Amt_Chng_Q4_Q1 | 10127.000 | 0.760 | 0.219 | 0.000 | 0.631 | 0.736 | 0.859 | 3.397 |
| Total_Trans_Amt | 10127.000 | 4404.086 | 3397.129 | 510.000 | 2155.500 | 3899.000 | 4741.000 | 18484.000 |
| Total_Trans_Ct | 10127.000 | 64.859 | 23.473 | 10.000 | 45.000 | 67.000 | 81.000 | 139.000 |
| Total_Ct_Chng_Q4_Q1 | 10127.000 | 0.712 | 0.238 | 0.000 | 0.582 | 0.702 | 0.818 | 3.714 |
| Avg_Utilization_Ratio | 10127.000 | 0.275 | 0.276 | 0.000 | 0.023 | 0.176 | 0.503 | 0.999 |
df.describe(include=object).T
| count | unique | top | freq | |
|---|---|---|---|---|
| Gender | 10127 | 2 | F | 5358 |
| Education_Level | 8608 | 6 | Graduate | 3128 |
| Marital_Status | 9378 | 3 | Married | 4687 |
| Income_Category | 10127 | 6 | Less than $40K | 3561 |
| Card_Category | 10127 | 4 | Blue | 9436 |
Let's check the count of each unique category in each of the categorical variables.
# Making a list of all categorical variables: the object-typed columns plus a
# few integer columns that are categorical in nature (small fixed set of levels)
cat_col = df.select_dtypes(include="object").columns.tolist()
cat_num = [
    "Dependent_count",
    "Total_Relationship_Count",
    "Months_Inactive_12_mon",
    "Contacts_Count_12_mon",
]
# extend() replaces the original element-by-element append loop (same result)
cat_col.extend(cat_num)
cat_col
['Gender', 'Education_Level', 'Marital_Status', 'Income_Category', 'Card_Category', 'Dependent_count', 'Total_Relationship_Count', 'Months_Inactive_12_mon', 'Contacts_Count_12_mon']
# Show the frequency of every level in each categorical column
for column in cat_col:
    print(f"Unique values in \033[1m{column}\033[m are:")
    print(df[column].value_counts())
    print("\n", "*" * 50, "\n")
Unique values in Gender are: F 5358 M 4769 Name: Gender, dtype: int64 ************************************************** Unique values in Education_Level are: Graduate 3128 High School 2013 Uneducated 1487 College 1013 Post-Graduate 516 Doctorate 451 Name: Education_Level, dtype: int64 ************************************************** Unique values in Marital_Status are: Married 4687 Single 3943 Divorced 748 Name: Marital_Status, dtype: int64 ************************************************** Unique values in Income_Category are: Less than $40K 3561 $40K - $60K 1790 $80K - $120K 1535 $60K - $80K 1402 abc 1112 $120K + 727 Name: Income_Category, dtype: int64 ************************************************** Unique values in Card_Category are: Blue 9436 Silver 555 Gold 116 Platinum 20 Name: Card_Category, dtype: int64 ************************************************** Unique values in Dependent_count are: 3 2732 2 2655 1 1838 4 1574 0 904 5 424 Name: Dependent_count, dtype: int64 ************************************************** Unique values in Total_Relationship_Count are: 3 2305 4 1912 5 1891 6 1866 2 1243 1 910 Name: Total_Relationship_Count, dtype: int64 ************************************************** Unique values in Months_Inactive_12_mon are: 3 3846 2 3282 1 2233 4 435 5 178 6 124 0 29 Name: Months_Inactive_12_mon, dtype: int64 ************************************************** Unique values in Contacts_Count_12_mon are: 3 3380 2 3227 1 1499 4 1392 0 399 5 176 6 54 Name: Contacts_Count_12_mon, dtype: int64 **************************************************
# The placeholder 'abc' in Income_Category is really a missing value
df.Income_Category = df.Income_Category.replace("abc", np.nan)
# Confirm the replacement now shows up as NaN in the null counts
df.isnull().sum()
Attrition_Flag 0 Customer_Age 0 Gender 0 Dependent_count 0 Education_Level 1519 Marital_Status 749 Income_Category 1112 Card_Category 0 Months_on_book 0 Total_Relationship_Count 0 Months_Inactive_12_mon 0 Contacts_Count_12_mon 0 Credit_Limit 0 Total_Revolving_Bal 0 Avg_Open_To_Buy 0 Total_Amt_Chng_Q4_Q1 0 Total_Trans_Amt 0 Total_Trans_Ct 0 Total_Ct_Chng_Q4_Q1 0 Avg_Utilization_Ratio 0 dtype: int64
# Cast every categorical variable to pandas' memory-efficient 'category' dtype
for name in cat_col:
    df[name] = df[name].astype("category")
# Verify the resulting dtypes
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 10127 entries, 0 to 10126 Data columns (total 20 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Attrition_Flag 10127 non-null int64 1 Customer_Age 10127 non-null int64 2 Gender 10127 non-null category 3 Dependent_count 10127 non-null category 4 Education_Level 8608 non-null category 5 Marital_Status 9378 non-null category 6 Income_Category 9015 non-null category 7 Card_Category 10127 non-null category 8 Months_on_book 10127 non-null int64 9 Total_Relationship_Count 10127 non-null category 10 Months_Inactive_12_mon 10127 non-null category 11 Contacts_Count_12_mon 10127 non-null category 12 Credit_Limit 10127 non-null float64 13 Total_Revolving_Bal 10127 non-null int64 14 Avg_Open_To_Buy 10127 non-null float64 15 Total_Amt_Chng_Q4_Q1 10127 non-null float64 16 Total_Trans_Amt 10127 non-null int64 17 Total_Trans_Ct 10127 non-null int64 18 Total_Ct_Chng_Q4_Q1 10127 non-null float64 19 Avg_Utilization_Ratio 10127 non-null float64 dtypes: category(9), float64(5), int64(6) memory usage: 961.3 KB
# function to plot a boxplot and a histogram along the same scale.
def histogram_boxplot(data, feature, figsize=(12, 7), kde=False, bins=None):
    """
    Boxplot and histogram combined

    data: dataframe
    feature: dataframe column
    figsize: size of figure (default (12,7))
    kde: whether to show the density curve (default False)
    bins: number of bins for histogram (default None)
    """
    # Two stacked axes sharing the x-axis: a slim boxplot above a histogram
    f2, (ax_box2, ax_hist2) = plt.subplots(
        nrows=2,  # Number of rows of the subplot grid = 2
        sharex=True,  # x-axis will be shared among all subplots
        gridspec_kw={"height_ratios": (0.25, 0.75)},
        figsize=figsize,
    )
    # Boxplot; a red circle marker indicates the mean value of the column
    sns.boxplot(
        data=data,
        x=feature,
        ax=ax_box2,
        color="Khaki",
        showmeans=True,
        meanprops={
            "marker": "o",
            "markerfacecolor": "red",
            "markeredgecolor": "red",
            "markersize": "6",
        },
    )
    # Histogram. A plain if/else replaces the original conditional expression,
    # which was evaluated purely for its side effect — an idiom violation.
    if bins:
        sns.histplot(
            data=data, x=feature, kde=kde, ax=ax_hist2, bins=bins, color="SkyBlue"
        )
    else:
        sns.histplot(data=data, x=feature, kde=kde, ax=ax_hist2, color="SkyBlue")
    # Mark the mean (red dashed) and median (black solid) on the histogram
    ax_hist2.axvline(data[feature].mean(), color="red", linestyle="--")
    ax_hist2.axvline(data[feature].median(), color="black", linestyle="-")
histogram_boxplot(df, "Customer_Age")
df[df["Customer_Age"] > 69]
| Attrition_Flag | Customer_Age | Gender | Dependent_count | Education_Level | Marital_Status | Income_Category | Card_Category | Months_on_book | Total_Relationship_Count | Months_Inactive_12_mon | Contacts_Count_12_mon | Credit_Limit | Total_Revolving_Bal | Avg_Open_To_Buy | Total_Amt_Chng_Q4_Q1 | Total_Trans_Amt | Total_Trans_Ct | Total_Ct_Chng_Q4_Q1 | Avg_Utilization_Ratio | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 251 | 0 | 73 | M | 0 | High School | Married | $40K - $60K | Blue | 36 | 5 | 3 | 2 | 4469.000 | 1125 | 3344.000 | 1.363 | 1765 | 34 | 1.615 | 0.252 |
| 254 | 0 | 70 | M | 0 | High School | Married | Less than $40K | Blue | 56 | 3 | 2 | 3 | 3252.000 | 1495 | 1757.000 | 0.581 | 1227 | 15 | 0.875 | 0.460 |
histogram_boxplot(df, "Months_on_book")
df[(df["Months_on_book"] < 18) | (df["Months_on_book"] > 53)]["Months_on_book"].count()
386
histogram_boxplot(df, "Credit_Limit")
df[df["Credit_Limit"] > 25000]["Credit_Limit"].count()
892
histogram_boxplot(df, "Total_Revolving_Bal")
# Checking % of customers with Total_Revolving_Bal equal to Zero
df[df["Total_Revolving_Bal"] == 0]["Total_Revolving_Bal"].count() / len(
df["Total_Revolving_Bal"]
) * 100
24.390243902439025
histogram_boxplot(df, "Avg_Open_To_Buy")
histogram_boxplot(df, "Total_Trans_Amt")
histogram_boxplot(df, "Total_Trans_Ct")
df[df["Total_Trans_Ct"] > 135]
| Attrition_Flag | Customer_Age | Gender | Dependent_count | Education_Level | Marital_Status | Income_Category | Card_Category | Months_on_book | Total_Relationship_Count | Months_Inactive_12_mon | Contacts_Count_12_mon | Credit_Limit | Total_Revolving_Bal | Avg_Open_To_Buy | Total_Amt_Chng_Q4_Q1 | Total_Trans_Amt | Total_Trans_Ct | Total_Ct_Chng_Q4_Q1 | Avg_Utilization_Ratio | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 9324 | 0 | 41 | M | 3 | NaN | Married | $120K + | Blue | 33 | 2 | 4 | 3 | 34516.000 | 638 | 33878.000 | 0.724 | 13085 | 139 | 0.675 | 0.018 |
| 9586 | 0 | 56 | F | 1 | High School | Married | NaN | Blue | 49 | 1 | 2 | 1 | 17542.000 | 2517 | 15025.000 | 0.800 | 13939 | 138 | 0.792 | 0.143 |
histogram_boxplot(df, "Total_Ct_Chng_Q4_Q1")
df[df["Total_Ct_Chng_Q4_Q1"] > 1]["Total_Ct_Chng_Q4_Q1"].count() / len(
df["Total_Ct_Chng_Q4_Q1"]
)
0.06596227905598893
df[df["Total_Ct_Chng_Q4_Q1"] < 0.5]["Total_Ct_Chng_Q4_Q1"].count() / len(
df["Total_Ct_Chng_Q4_Q1"]
)
0.13528191962081565
histogram_boxplot(df, "Total_Amt_Chng_Q4_Q1")
df[df["Total_Amt_Chng_Q4_Q1"] > 1]["Total_Amt_Chng_Q4_Q1"].count() / len(
df["Total_Amt_Chng_Q4_Q1"]
)
0.09805470524340872
df[df["Total_Amt_Chng_Q4_Q1"] < 0.5]["Total_Amt_Chng_Q4_Q1"].count() / len(
df["Total_Amt_Chng_Q4_Q1"]
)
0.07198578058655081
histogram_boxplot(df, "Avg_Utilization_Ratio")
df[df["Avg_Utilization_Ratio"] > 0.8]["Avg_Utilization_Ratio"].count() / len(
df["Avg_Utilization_Ratio"]
)
0.046114347783153944
# Function to create barplots that indicate percentage for each category.
def perc_on_bar(dataframe):
    """
    Plot a countplot of a categorical series and annotate each bar with the
    percentage of observations it represents.

    dataframe: categorical series (a single column of the dataframe)
    Note: the function won't work if a column is passed in the hue parameter.
    """
    total = len(dataframe)  # denominator for the percentage of each class
    plt.figure(figsize=(10, 5))
    # Pass the series by keyword: seaborn has deprecated positional data args
    ax = sns.countplot(x=dataframe, palette="Set3")
    for p in ax.patches:
        # percentage of each class of the category
        percentage = "{:.1f}%".format(100 * p.get_height() / total)
        x = p.get_x() + p.get_width() / 2 - 0.05  # x position of the label
        y = p.get_y() + p.get_height()  # height of the bar (label y position)
        ax.annotate(percentage, (x, y), size=12)  # annotate the percentage
    plt.show()  # show the plot
# Percentage barplots for the target and each categorical variable of interest
for series in (
    df.Attrition_Flag,
    df.Gender,
    df.Education_Level,
    df.Marital_Status,
    df.Income_Category,
    df.Card_Category,
    df.Total_Relationship_Count,
    df.Months_Inactive_12_mon,
    df.Contacts_Count_12_mon,
):
    perc_on_bar(series)
df.corr()["Attrition_Flag"].sort_values(ascending=False)
Attrition_Flag 1.000 Customer_Age 0.018 Months_on_book 0.014 Avg_Open_To_Buy -0.000 Credit_Limit -0.024 Total_Amt_Chng_Q4_Q1 -0.131 Total_Trans_Amt -0.169 Avg_Utilization_Ratio -0.178 Total_Revolving_Bal -0.263 Total_Ct_Chng_Q4_Q1 -0.290 Total_Trans_Ct -0.371 Name: Attrition_Flag, dtype: float64
sns.pairplot(data=df, hue="Attrition_Flag")
<seaborn.axisgrid.PairGrid at 0x1337993cc70>
As expected:
Attrited customers (whose accounts are closed) show some patterns, such as:
# Correlation heatmap of the numeric columns
plt.figure(figsize=(12, 7))
sns.heatmap(df.corr(), annot=True, vmin=-1, vmax=1, fmt=".2f", cmap="BrBG")
# Bug fix: the original wrote `plt.show` without parentheses, so the function
# was never called (the cell output shows the function object instead)
plt.show()
<function matplotlib.pyplot.show(close=None, block=None)>
Avg_Open_To_Buy and Credit_Limit are almost perfectly correlated, so keeping both will not add value to our analysis.
# Creating a list of numerical variables
# Numerical variables are the columns that are not categorical.
# A comprehension replaces the original manual append loop (same result).
num_col = [col for col in df.columns if col not in cat_col]
# The target itself is not a feature, so exclude it
num_col.remove("Attrition_Flag")
# Boxplot of each numerical variable against the target
plt.figure(figsize=(15, 15))
# Bug fix: plt.subplot requires an integer row count; the original passed the
# float len(num_col) / 2, which recent matplotlib versions reject. Ceiling
# division keeps the same 2-column layout for any number of variables.
n_rows = (len(num_col) + 1) // 2
for i, variable in enumerate(num_col):
    plt.subplot(n_rows, 2, i + 1)
    sns.boxplot(df["Attrition_Flag"], df[variable])
    plt.tight_layout()
    plt.title(variable)
plt.show()
Let's define one more function to plot stacked bar charts
# function to plot stacked bar chart
def stacked_barplot(data, predictor, target):
    """
    Print the category counts and plot a stacked bar chart

    data: dataframe
    predictor: independent variable
    target: target variable
    """
    count = data[predictor].nunique()  # number of categories, sizes the figure
    # sort by the rarer target class so the most-affected categories come first
    sorter = data[target].value_counts().index[-1]
    tab1 = pd.crosstab(data[predictor], data[target], margins=True).sort_values(
        by=sorter, ascending=False
    )
    print(tab1)
    print("-" * 120)
    # row-normalized crosstab gives per-category proportions for the stacked bars
    tab = pd.crosstab(data[predictor], data[target], normalize="index").sort_values(
        by=sorter, ascending=False
    )
    tab.plot(kind="bar", stacked=True, figsize=(count + 1, 5))
    # Bug fix: the original called plt.legend twice; the first call
    # (loc="lower left", frameon=False) was immediately overwritten by the
    # second and had no effect, so it has been removed.
    plt.legend(loc="upper left", bbox_to_anchor=(1, 1))
    plt.show()
stacked_barplot(df, "Gender", "Attrition_Flag")
Attrition_Flag 0 1 All Gender All 8500 1627 10127 F 4428 930 5358 M 4072 697 4769 ------------------------------------------------------------------------------------------------------------------------
stacked_barplot(df, "Education_Level", "Attrition_Flag")
Attrition_Flag 0 1 All Education_Level All 7237 1371 8608 Graduate 2641 487 3128 High School 1707 306 2013 Uneducated 1250 237 1487 College 859 154 1013 Doctorate 356 95 451 Post-Graduate 424 92 516 ------------------------------------------------------------------------------------------------------------------------
stacked_barplot(df, "Marital_Status", "Attrition_Flag")
Attrition_Flag 0 1 All Marital_Status All 7880 1498 9378 Married 3978 709 4687 Single 3275 668 3943 Divorced 627 121 748 ------------------------------------------------------------------------------------------------------------------------
stacked_barplot(df, "Income_Category", "Attrition_Flag")
Attrition_Flag 0 1 All Income_Category All 7575 1440 9015 Less than $40K 2949 612 3561 $40K - $60K 1519 271 1790 $80K - $120K 1293 242 1535 $60K - $80K 1213 189 1402 $120K + 601 126 727 ------------------------------------------------------------------------------------------------------------------------
stacked_barplot(df, "Card_Category", "Attrition_Flag")
Attrition_Flag 0 1 All Card_Category All 8500 1627 10127 Blue 7917 1519 9436 Silver 473 82 555 Gold 95 21 116 Platinum 15 5 20 ------------------------------------------------------------------------------------------------------------------------
stacked_barplot(df, "Dependent_count", "Attrition_Flag")
Attrition_Flag 0 1 All Dependent_count All 8500 1627 10127 3 2250 482 2732 2 2238 417 2655 1 1569 269 1838 4 1314 260 1574 0 769 135 904 5 360 64 424 ------------------------------------------------------------------------------------------------------------------------
stacked_barplot(df, "Total_Relationship_Count", "Attrition_Flag")
Attrition_Flag 0 1 All Total_Relationship_Count All 8500 1627 10127 3 1905 400 2305 2 897 346 1243 1 677 233 910 5 1664 227 1891 4 1687 225 1912 6 1670 196 1866 ------------------------------------------------------------------------------------------------------------------------
stacked_barplot(df, "Months_Inactive_12_mon", "Attrition_Flag")
Attrition_Flag 0 1 All Months_Inactive_12_mon All 8500 1627 10127 3 3020 826 3846 2 2777 505 3282 4 305 130 435 1 2133 100 2233 5 146 32 178 6 105 19 124 0 14 15 29 ------------------------------------------------------------------------------------------------------------------------
stacked_barplot(df, "Contacts_Count_12_mon", "Attrition_Flag")
Attrition_Flag 0 1 All Contacts_Count_12_mon All 8500 1627 10127 3 2699 681 3380 2 2824 403 3227 4 1077 315 1392 1 1391 108 1499 5 117 59 176 6 0 54 54 0 392 7 399 ------------------------------------------------------------------------------------------------------------------------
# Swarm plot: months of inactivity vs. Q4/Q1 amount change, split by attrition
sns.catplot(
    x="Months_Inactive_12_mon",
    y="Total_Amt_Chng_Q4_Q1",
    hue="Attrition_Flag",
    kind="swarm",
    data=df,
)
<seaborn.axisgrid.FacetGrid at 0x13304bfe610>
plt.figure(figsize=(10, 6))
# keyword args: seaborn has deprecated passing data vectors positionally
sns.scatterplot(
    x=df["Total_Revolving_Bal"], y=df["Total_Trans_Amt"], hue=df["Attrition_Flag"]
)
<AxesSubplot:xlabel='Total_Revolving_Bal', ylabel='Total_Trans_Amt'>
plt.figure(figsize=(10, 6))
# keyword args: seaborn has deprecated passing data vectors positionally
sns.scatterplot(
    x=df["Total_Amt_Chng_Q4_Q1"], y=df["Total_Trans_Amt"], hue=df["Attrition_Flag"]
)
<AxesSubplot:xlabel='Total_Amt_Chng_Q4_Q1', ylabel='Total_Trans_Amt'>
plt.figure(figsize=(10, 6))
# keyword args: seaborn has deprecated passing data vectors positionally
sns.boxplot(
    x=df["Total_Relationship_Count"], y=df["Total_Trans_Amt"], hue=df["Attrition_Flag"]
)
<AxesSubplot:xlabel='Total_Relationship_Count', ylabel='Total_Trans_Amt'>
plt.figure(figsize=(10, 6))
# keyword args: seaborn has deprecated passing data vectors positionally
sns.boxplot(
    x=df["Total_Relationship_Count"],
    y=df["Total_Revolving_Bal"],
    hue=df["Attrition_Flag"],
)
<AxesSubplot:xlabel='Total_Relationship_Count', ylabel='Total_Revolving_Bal'>
plt.figure(figsize=(10, 6))
# keyword args: seaborn has deprecated passing data vectors positionally
sns.boxplot(x=df["Card_Category"], y=df["Total_Revolving_Bal"], hue=df["Attrition_Flag"])
<AxesSubplot:xlabel='Card_Category', ylabel='Total_Revolving_Bal'>
plt.figure(figsize=(10, 6))
# keyword args: seaborn has deprecated passing data vectors positionally
sns.boxplot(x=df["Income_Category"], y=df["Total_Trans_Ct"], hue=df["Attrition_Flag"])
<AxesSubplot:xlabel='Income_Category', ylabel='Total_Trans_Ct'>
plt.figure(figsize=(10, 6))
# keyword args: seaborn has deprecated passing data vectors positionally
sns.boxplot(
    x=df["Contacts_Count_12_mon"], y=df["Total_Trans_Ct"], hue=df["Attrition_Flag"]
)
<AxesSubplot:xlabel='Contacts_Count_12_mon', ylabel='Total_Trans_Ct'>
# Work on a fresh copy so df itself is left untouched
data1 = df.copy()
# Avg_Open_To_Buy is highly correlated with other variables, so drop it
data1.drop(columns=["Avg_Open_To_Buy"], inplace=True)
# Separating target variable and other variables
X = data1.drop(columns="Attrition_Flag")
Y = data1["Attrition_Flag"]
# Splitting data into training, validation and test sets:
# step 1 — hold out 20% as the test set (stratified on the target)
X_temp, X_test, y_temp, y_test = train_test_split(
    X, Y, test_size=0.2, random_state=1, stratify=Y
)
# step 2 — split the remaining 80% into train (75%) and validation (25%)
X_train, X_val, y_train, y_val = train_test_split(
    X_temp, y_temp, test_size=0.25, random_state=1, stratify=y_temp
)
print(X_train.shape, X_val.shape, X_test.shape)
(6075, 18) (2026, 18) (2026, 18)
data1.isnull().sum()
Attrition_Flag 0 Customer_Age 0 Gender 0 Dependent_count 0 Education_Level 1519 Marital_Status 749 Income_Category 1112 Card_Category 0 Months_on_book 0 Total_Relationship_Count 0 Months_Inactive_12_mon 0 Contacts_Count_12_mon 0 Credit_Limit 0 Total_Revolving_Bal 0 Total_Amt_Chng_Q4_Q1 0 Total_Trans_Amt 0 Total_Trans_Ct 0 Total_Ct_Chng_Q4_Q1 0 Avg_Utilization_Ratio 0 dtype: int64
# Impute missing values in the three categorical columns with the mode.
# Each imputer is fit on the TRAIN data only and then applied to validation
# and test, so no information leaks from those sets. A loop replaces the
# three near-identical copy-pasted blocks of the original (DRY).
for col in ["Education_Level", "Marital_Status", "Income_Category"]:
    imp_mode = SimpleImputer(missing_values=np.nan, strategy="most_frequent")
    # fit the imputer on train data and transform the train data
    X_train[col] = imp_mode.fit_transform(X_train[[col]])
    # transform the validation and test data using the imputer fit on train data
    X_val[col] = imp_mode.transform(X_val[[col]])
    X_test[col] = imp_mode.transform(X_test[[col]])
# Creating dummy variables for categorical variables
X_train = pd.get_dummies(data=X_train, drop_first=True)
X_val = pd.get_dummies(data=X_val, drop_first=True)
X_test = pd.get_dummies(data=X_test, drop_first=True)
# Bug fix: encoding each split independently can produce different column sets
# (a rare category absent from one split drops its dummy there). Align the
# validation and test frames to the train columns, filling missing dummies
# with 0, so every model sees an identical feature layout.
X_val = X_val.reindex(columns=X_train.columns, fill_value=0)
X_test = X_test.reindex(columns=X_train.columns, fill_value=0)
X_train.shape
(6075, 46)
Let's evaluate the model performance by using KFold and cross_val_score
K-Folds cross-validation provides dataset indices to split the data into train/validation sets. The dataset is split into k consecutive stratified folds (without shuffling by default); each fold is then used once as validation while the remaining k - 1 folds form the training set.
models = []  # Empty list to store all the models
# Appending the candidate models into the list (fixed random_state for reproducibility)
models.append(("Bagging", BaggingClassifier(random_state=1)))
models.append(("Random forest", RandomForestClassifier(random_state=1)))
models.append(("GBM", GradientBoostingClassifier(random_state=1)))
models.append(("Adaboost", AdaBoostClassifier(random_state=1)))
models.append(("Xgboost", XGBClassifier(random_state=1, eval_metric="logloss")))
models.append(("dtree", DecisionTreeClassifier(random_state=1)))
results = []  # Empty list to store all model's CV scores
names = []  # Empty list to store name of the models
# Mean cross-validated recall of each model (recall is the metric of interest:
# the bank wants to catch as many soon-to-attrite customers as possible)
print("\n" "Cross-Validation Performance:" "\n")
for name, model in models:
    scoring = "recall"
    kfold = StratifiedKFold(
        n_splits=5, shuffle=True, random_state=1
    )  # Setting number of splits equal to 5
    cv_result = cross_val_score(
        estimator=model, X=X_train, y=y_train, scoring=scoring, cv=kfold
    )
    results.append(cv_result)
    names.append(name)
    print("{}: {:.3f}".format(name, cv_result.mean() * 100))
print("\n" "Training Performance:" "\n")
for name, model in models:
    model.fit(X_train, y_train)
    scores = recall_score(y_train, model.predict(X_train)) * 100
    print("{}: {:.3f}".format(name, scores))
print("\n" "Validation Performance:" "\n")
for name, model in models:
    # Fix: the models were already fitted in the training loop above; the
    # original refit every model here, repeating identical work for no gain
    scores_val = recall_score(y_val, model.predict(X_val)) * 100
    print("{}: {:.3f}".format(name, scores_val))
Cross-Validation Performance: Bagging: 76.636 Random forest: 71.512 GBM: 80.427 Adaboost: 80.835 Xgboost: 84.935 dtree: 78.173 Training Performance: Bagging: 98.463 Random forest: 100.000 GBM: 87.090 Adaboost: 83.607 Xgboost: 100.000 dtree: 100.000 Validation Performance: Bagging: 80.675 Random forest: 73.620 GBM: 81.595 Adaboost: 81.288 Xgboost: 89.571 dtree: 78.221
# Compare the CV score distributions of all models with side-by-side boxplots
fig = plt.figure(figsize=(10, 7))
fig.suptitle("Algorithm Comparison")
ax = fig.add_subplot(111)
ax.boxplot(results)
ax.set_xticklabels(names)
plt.show()
### Oversampling train data using SMOTE
# Balance the classes in the training split with SMOTE
print(f"Before OverSampling, counts of label 'Yes': {sum(y_train == 1)}")
print(f"Before OverSampling, counts of label 'No': {sum(y_train == 0)} \n")
# Synthetic Minority Over Sampling Technique: synthesize minority-class rows
# from their 5 nearest neighbours until both classes are equally represented
sm = SMOTE(sampling_strategy=1, k_neighbors=5, random_state=1)
X_train_over, y_train_over = sm.fit_resample(X_train, y_train)
print(f"After OverSampling, counts of label 'Yes': {sum(y_train_over == 1)}")
print(f"After OverSampling, counts of label 'No': {sum(y_train_over == 0)} \n")
print(f"After OverSampling, the shape of train_X: {X_train_over.shape}")
print(f"After OverSampling, the shape of train_y: {y_train_over.shape} \n")
Before OverSampling, counts of label 'Yes': 976 Before OverSampling, counts of label 'No': 5099 After OverSampling, counts of label 'Yes': 5099 After OverSampling, counts of label 'No': 5099 After OverSampling, the shape of train_X: (10198, 46) After OverSampling, the shape of train_y: (10198,)
models_over = []  # Empty list to store all the models
# Appending models into the list (same candidates, now trained on SMOTE data)
models_over.append(("Bagging_SMOTE", BaggingClassifier(random_state=1)))
models_over.append(("RandomForest_SMOTE", RandomForestClassifier(random_state=1)))
models_over.append(("GBM_SMOTE", GradientBoostingClassifier(random_state=1)))
models_over.append(("Adaboost_SMOTE", AdaBoostClassifier(random_state=1)))
models_over.append(
    ("Xgboost_SMOTE", XGBClassifier(random_state=1, eval_metric="logloss"))
)
models_over.append(("dtree_SMOTE", DecisionTreeClassifier(random_state=1)))
results_over = []  # Empty list to store all model's CV scores
names_over = []  # Empty list to store name of the models
# Mean cross-validated recall of each model on the oversampled train data
print("\n" "Cross-Validation Performance:" "\n")
for name, model in models_over:
    scoring = "recall"
    kfold = StratifiedKFold(
        n_splits=5, shuffle=True, random_state=1
    )  # Setting number of splits equal to 5
    cv_result_over = cross_val_score(
        estimator=model, X=X_train_over, y=y_train_over, scoring=scoring, cv=kfold
    )
    results_over.append(cv_result_over)
    names_over.append(name)
    print("{}: {:.3f}".format(name, cv_result_over.mean() * 100))
print("\n" "Training Performance:" "\n")
for name, model in models_over:
    model.fit(X_train_over, y_train_over)
    scores_over = recall_score(y_train_over, model.predict(X_train_over)) * 100
    print("{}: {:.3f}".format(name, scores_over))
print("\n" "Validation Performance:" "\n")
for name, model in models_over:
    # Fix: the models were already fitted in the training loop above; the
    # original refit every model here, repeating identical work for no gain
    scores_over_val = recall_score(y_val, model.predict(X_val)) * 100
    print("{}: {:.3f}".format(name, scores_over_val))
Cross-Validation Performance: Bagging_SMOTE: 96.215 RandomForest_SMOTE: 96.549 GBM_SMOTE: 97.490 Adaboost_SMOTE: 96.627 Xgboost_SMOTE: 97.941 dtree_SMOTE: 94.842 Training Performance: Bagging_SMOTE: 99.804 RandomForest_SMOTE: 100.000 GBM_SMOTE: 98.137 Adaboost_SMOTE: 97.058 Xgboost_SMOTE: 100.000 dtree_SMOTE: 100.000 Validation Performance: Bagging_SMOTE: 84.969 RandomForest_SMOTE: 77.914 GBM_SMOTE: 88.650 Adaboost_SMOTE: 88.344 Xgboost_SMOTE: 89.877 dtree_SMOTE: 81.902
# Compare the CV score distributions of the SMOTE-trained models side by side
fig, ax = plt.subplots(figsize=(10, 7))
fig.suptitle("SMOTE Algorithm Comparison")
ax.boxplot(results_over)
ax.set_xticklabels(names_over)
plt.show()
### Undersampling train data
# Balance the training data by randomly dropping majority-class rows
rus = RandomUnderSampler(random_state=1)
X_train_under, y_train_under = rus.fit_resample(X_train, y_train)

# Class balance before and after, plus resulting shapes
print("Before Under Sampling, counts of label 'Yes': {}".format(sum(y_train == 1)))
print("Before Under Sampling, counts of label 'No': {} \n".format(sum(y_train == 0)))
print("After Under Sampling, counts of label 'Yes': {}".format(sum(y_train_under == 1)))
print(
    "After Under Sampling, counts of label 'No': {} \n".format(sum(y_train_under == 0))
)
print("After Under Sampling, the shape of train_X: {}".format(X_train_under.shape))
print("After Under Sampling, the shape of train_y: {} \n".format(y_train_under.shape))
Before Under Sampling, counts of label 'Yes': 976 Before Under Sampling, counts of label 'No': 5099 After Under Sampling, counts of label 'Yes': 976 After Under Sampling, counts of label 'No': 976 After Under Sampling, the shape of train_X: (1952, 46) After Under Sampling, the shape of train_y: (1952,)
# Candidate ensemble models, trained on the undersampled data
models_under = [
    ("Bagging_un", BaggingClassifier(random_state=1)),
    ("RandomForest_un", RandomForestClassifier(random_state=1)),
    ("GBM_un", GradientBoostingClassifier(random_state=1)),
    ("Adaboost_un", AdaBoostClassifier(random_state=1)),
    ("Xgboost_un", XGBClassifier(random_state=1, eval_metric="logloss")),
    ("dtree_un", DecisionTreeClassifier(random_state=1)),
]

results_under = []  # one CV-score array per model
names_under = []  # model names, aligned with results_under

# Mean 5-fold stratified CV recall for each model on the undersampled train set
print("\n" "Cross-Validation Performance:" "\n")
for name, model in models_under:
    kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)
    cv_result_under = cross_val_score(
        estimator=model, X=X_train_under, y=y_train_under, scoring="recall", cv=kfold
    )
    results_under.append(cv_result_under)
    names_under.append(name)
    print("{}: {:.3f}".format(name, cv_result_under.mean() * 100))

# Recall on the data each model was fitted on (optimistic by construction)
print("\n" "Training Performance:" "\n")
for name, model in models_under:
    model.fit(X_train_under, y_train_under)
    scores_under = recall_score(y_train_under, model.predict(X_train_under)) * 100
    print("{}: {:.3f}".format(name, scores_under))

# Recall on the untouched validation split
print("\n" "Validation Performance:" "\n")
for name, model in models_under:
    model.fit(X_train_under, y_train_under)
    scores_under_val = recall_score(y_val, model.predict(X_val)) * 100
    print("{}: {:.3f}".format(name, scores_under_val))
Cross-Validation Performance: Bagging_un: 90.267 RandomForest_un: 92.624 GBM_un: 93.957 Adaboost_un: 92.625 Xgboost_un: 95.493 dtree_un: 89.038 Training Performance: Bagging_un: 98.975 RandomForest_un: 100.000 GBM_un: 97.951 Adaboost_un: 94.877 Xgboost_un: 100.000 dtree_un: 100.000 Validation Performance: Bagging_un: 91.718 RandomForest_un: 92.945 GBM_un: 94.172 Adaboost_un: 94.172 Xgboost_un: 96.626 dtree_un: 88.344
# Plotting boxplots for CV scores of all Undersampled models defined above
# (comment corrected: these are the RandomUnderSampler models, not the SMOTE ones)
fig = plt.figure(figsize=(10, 7))
fig.suptitle("UNDER Algorithm Comparison")
ax = fig.add_subplot(111)
plt.boxplot(results_under)
ax.set_xticklabels(names_under)
plt.show()
The best cross-validation performance happened with oversampling; on the other hand, undersampling shows more consistency between the cross-validation and validation-set scores. Considering that with oversampling the validation scores fell outside (and below) the range of the cross-validation scores, we choose the undersampled data set, where the validation scores lie within the range of the cross-validation scores — hence it is the best approach for us.
Our top 3 models, with the best expected performance on unseen data (considering both cross-validation and validation-set results), come from the UNDERSAMPLED data set:
We will tune XGB, GBM and Adaboost models using Randomized Search CV.
# Helper that summarises the headline classification metrics for a fitted model
def model_performance_classification_sklearn(model, predictors, target):
    """
    Function to compute different metrics to check classification model performance

    model: fitted classifier exposing .predict
    predictors: independent variables
    target: dependent variable (true labels)

    Returns a one-row pandas DataFrame with Accuracy, Recall, Precision and F1.
    """
    # Predict once, then score against the true labels
    pred = model.predict(predictors)

    # Pack the four metrics into a single-row dataframe for easy concatenation
    return pd.DataFrame(
        {
            "Accuracy": accuracy_score(target, pred),
            "Recall": recall_score(target, pred),
            "Precision": precision_score(target, pred),
            "F1": f1_score(target, pred),
        },
        index=[0],
    )
def confusion_matrix_sklearn(model, predictors, target):
    """
    To plot the confusion_matrix with percentages

    model: fitted classifier exposing .predict
    predictors: independent variables
    target: dependent variable (true labels)
    """
    y_pred = model.predict(predictors)
    cm = confusion_matrix(target, y_pred)

    # Annotate each cell with its raw count plus its share of all predictions
    flat = cm.flatten()
    total = flat.sum()
    labels = np.asarray(
        ["{0:0.0f}".format(item) + "\n{0:.2%}".format(item / total) for item in flat]
    ).reshape(2, 2)

    plt.figure(figsize=(6, 4))
    sns.heatmap(cm, annot=labels, fmt="")
    plt.ylabel("True label")
    plt.xlabel("Predicted label")
# Baseline (default-parameter) AdaBoost fitted on the undersampled training data
Adaboost_default = AdaBoostClassifier(random_state=1)
Adaboost_default.fit(X_train_under, y_train_under)
AdaBoostClassifier(random_state=1)
# Evaluate the default AdaBoost on the train and validation splits
Adaboost_df_train = model_performance_classification_sklearn(
    Adaboost_default, X_train_under, y_train_under
)
Adaboost_df_val = model_performance_classification_sklearn(
    Adaboost_default, X_val, y_val
)

print("Training performance:")
print(Adaboost_df_train)
print("Validation performance:")
print(Adaboost_df_val)
Training performance: Accuracy Recall Precision F1 0 0.943 0.949 0.938 0.943 Validation performance: Accuracy Recall Precision F1 0 0.921 0.942 0.684 0.792
%%time
# Choose the type of classifier.
Adaboost_tuned = AdaBoostClassifier(random_state=1)
# Grid of parameters to choose from
parameters = {
"n_estimators": np.arange(10, 110, 10),
"learning_rate": [0.1, 0.01, 0.2, 0.05, 1],
"base_estimator": [
DecisionTreeClassifier(max_depth=1, random_state=1),
DecisionTreeClassifier(max_depth=2, random_state=1),
DecisionTreeClassifier(max_depth=3, random_state=1),
],
}
# Type of scoring used to compare parameter combinations
scorer = metrics.make_scorer(metrics.recall_score)
#Calling RandomizedSearchCV
Adaboost_tuned_random = RandomizedSearchCV(estimator=Adaboost_tuned, param_distributions=parameters, n_jobs = -1, n_iter=50, scoring=scorer, cv=5, random_state=1)
#Fitting parameters in RandomizedSearchCV
Adaboost_tuned_random.fit(X_train_under,y_train_under)
print("Best parameters are {} with CV score={}:" .format(Adaboost_tuned_random.best_params_,Adaboost_tuned_random.best_score_))
Best parameters are {'n_estimators': 90, 'learning_rate': 0.1, 'base_estimator': DecisionTreeClassifier(max_depth=3, random_state=1)} with CV score=0.949811616954474:
Wall time: 12.2 s
# Refit AdaBoost with the best hyperparameters found by the randomized search
Adaboost_best = AdaBoostClassifier(
    random_state=1,
    n_estimators=90,
    learning_rate=0.1,
    base_estimator=DecisionTreeClassifier(max_depth=3, random_state=1),
)
Adaboost_best.fit(X_train_under, y_train_under)
AdaBoostClassifier(base_estimator=DecisionTreeClassifier(max_depth=3,
random_state=1),
learning_rate=0.1, n_estimators=90, random_state=1)
# Calculating different metrics on train set for the tuned AdaBoost model
Adaboost_train = model_performance_classification_sklearn(
    Adaboost_best, X_train_under, y_train_under
)
print("Training performance:")
Adaboost_train
Training performance:
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 0.986 | 0.995 | 0.977 | 0.986 |
# Calculating different metrics on validation set for the tuned AdaBoost model
Adaboost_val = model_performance_classification_sklearn(Adaboost_best, X_val, y_val)
print("Validation performance:")
Adaboost_val
Validation performance:
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 0.937 | 0.951 | 0.735 | 0.829 |
# creating confusion matrix for the tuned AdaBoost model on the validation set
confusion_matrix_sklearn(Adaboost_best, X_val, y_val)
# Baseline (default-parameter) Gradient Boosting fitted on the undersampled training data
GB_default = GradientBoostingClassifier(random_state=1)
GB_default.fit(X_train_under, y_train_under)
GradientBoostingClassifier(random_state=1)
# Evaluate the default Gradient Boosting model on the train and validation splits
GB_df_train = model_performance_classification_sklearn(
    GB_default, X_train_under, y_train_under
)
GB_df_val = model_performance_classification_sklearn(GB_default, X_val, y_val)

print("Training performance:")
print(GB_df_train)
print("Validation performance:")
print(GB_df_val)
Training performance: Accuracy Recall Precision F1 0 0.972 0.980 0.966 0.973 Validation performance: Accuracy Recall Precision F1 0 0.933 0.942 0.724 0.819
%%time
# Choose the type of classifier.
GB_tuned = GradientBoostingClassifier(
init=AdaBoostClassifier(random_state=1), random_state=1
)
# Grid of parameters to choose from
## add from article
parameters ={"n_estimators": np.arange(50,200,25),
"subsample":[0.5,0.7,0.8,0.9,1],
"max_features":[0.7,0.8,0.9,1],
"learning_rate" : [0.05, 0.10, 0.15, 0.20, 0.25, 0.30]
}
# Type of scoring used to compare parameter combinations
scorer = metrics.make_scorer(metrics.recall_score)
#Calling RandomizedSearchCV
GB_tunned_random = RandomizedSearchCV(estimator=GB_tuned, param_distributions=parameters, n_jobs = -1, n_iter=50, scoring=scorer, cv=5, random_state=1)
#Fitting parameters in RandomizedSearchCV
GB_tunned_random.fit(X_train_under,y_train_under)
print("Best parameters are {} with CV score={}:" .format(GB_tunned_random.best_params_,GB_tunned_random.best_score_))
Best parameters are {'subsample': 1, 'n_estimators': 175, 'max_features': 0.8, 'learning_rate': 0.2} with CV score=0.9549188906331763:
Wall time: 17.2 s
# Refit Gradient Boosting with the best hyperparameters from the randomized search
GB_best = GradientBoostingClassifier(
    random_state=1,
    n_estimators=175,
    subsample=1,
    max_features=0.8,
    learning_rate=0.2,
)
GB_best.fit(X_train_under, y_train_under)
GradientBoostingClassifier(learning_rate=0.2, max_features=0.8,
n_estimators=175, random_state=1, subsample=1)
# Calculating different metrics on train set for the tuned Gradient Boosting model
GB_train = model_performance_classification_sklearn(
    GB_best, X_train_under, y_train_under
)
print("Training performance:")
GB_train
Training performance:
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 0.999 | 1.000 | 0.998 | 0.999 |
# Calculating different metrics on validation set for the tuned Gradient Boosting model
GB_val = model_performance_classification_sklearn(GB_best, X_val, y_val)
print("Validation performance:")
GB_val
Validation performance:
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 0.944 | 0.954 | 0.759 | 0.845 |
# creating confusion matrix for the tuned Gradient Boosting model on the validation set
confusion_matrix_sklearn(GB_best, X_val, y_val)
# Baseline (default-parameter) XGBoost fitted on the undersampled training data
XGB_default = XGBClassifier(random_state=1, eval_metric="logloss")
XGB_default.fit(X_train_under, y_train_under)
XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
colsample_bynode=1, colsample_bytree=1, eval_metric='logloss',
gamma=0, gpu_id=-1, importance_type='gain',
interaction_constraints='', learning_rate=0.300000012,
max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
monotone_constraints='()', n_estimators=100, n_jobs=12,
num_parallel_tree=1, random_state=1, reg_alpha=0, reg_lambda=1,
scale_pos_weight=1, subsample=1, tree_method='exact',
validate_parameters=1, verbosity=None)
# Evaluate the default XGBoost model on the train and validation splits
XGB_df_train = model_performance_classification_sklearn(
    XGB_default, X_train_under, y_train_under
)
XGB_df_val = model_performance_classification_sklearn(XGB_default, X_val, y_val)

print("Training performance:")
print(XGB_df_train)
print("Validation performance:")
print(XGB_df_val)
Training performance: Accuracy Recall Precision F1 0 1.000 1.000 1.000 1.000 Validation performance: Accuracy Recall Precision F1 0 0.941 0.966 0.745 0.841
%%time
# defining model
XGB_tunned = XGBClassifier(random_state=1,eval_metric='logloss')
# Parameter grid to pass in RandomizedSearchCV
parameters ={"n_estimators": np.arange(50,200,25),
"subsample":[0.5,0.7,0.8,0.9,1],
"learning_rate" : [0.05, 0.10, 0.15, 0.20, 0.25, 0.30],
}
# Type of scoring used to compare parameter combinations
scorer = metrics.make_scorer(metrics.recall_score)
#Calling RandomizedSearchCV
xgb_tuned_random = RandomizedSearchCV(estimator=XGB_tunned, param_distributions=parameters, n_iter=50, scoring=scorer, cv=5, random_state=1, n_jobs = -1)
#Fitting parameters in RandomizedSearchCV
xgb_tuned_random.fit(X_train_under,y_train_under)
print("Best parameters are {} with CV score={}:" .format(xgb_tuned_random.best_params_,xgb_tuned_random.best_score_))
Best parameters are {'subsample': 0.8, 'n_estimators': 175, 'learning_rate': 0.2} with CV score=0.9559445316588174:
Wall time: 21 s
# building model with best parameters found by the randomized search
XGB_best = XGBClassifier(
    random_state=1,
    eval_metric="logloss",
    learning_rate=0.2,
    subsample=0.8,
    n_estimators=175,
)
# Fit on the UNDERSAMPLED training data.
# Bug fix: this previously fitted on X_train_over (the SMOTE set), which is
# inconsistent with the tuning (xgb_tuned_random was fitted on X_train_under),
# with the train metrics computed on X_train_under below, and with the chosen
# undersampling approach — so the reported comparison mixed two training sets.
XGB_best.fit(X_train_under, y_train_under)
XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
colsample_bynode=1, colsample_bytree=1, eval_metric='logloss',
gamma=0, gpu_id=-1, importance_type='gain',
interaction_constraints='', learning_rate=0.2, max_delta_step=0,
max_depth=6, min_child_weight=1, missing=nan,
monotone_constraints='()', n_estimators=175, n_jobs=12,
num_parallel_tree=1, random_state=1, reg_alpha=0, reg_lambda=1,
scale_pos_weight=1, subsample=0.8, tree_method='exact',
validate_parameters=1, verbosity=None)
# Calculating different metrics on train set for the tuned XGBoost model
XGB_train = model_performance_classification_sklearn(
    XGB_best, X_train_under, y_train_under
)
print("Training performance:")
XGB_train
Training performance:
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 1.000 | 1.000 | 1.000 | 1.000 |
# Calculating different metrics on validation set for the tuned XGBoost model
XGB_val = model_performance_classification_sklearn(XGB_best, X_val, y_val)
print("Validation performance:")
XGB_val
Validation performance:
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 0.972 | 0.908 | 0.916 | 0.912 |
# creating confusion matrix for the tuned XGBoost model on the validation set
confusion_matrix_sklearn(XGB_best, X_val, y_val)
# Side-by-side train/validation comparison of the three tuned models
models_train_comp_df = pd.concat(
    [Adaboost_train.T, Adaboost_val.T, GB_train.T, GB_val.T, XGB_train.T, XGB_val.T],
    axis=1,
)
models_train_comp_df.columns = [
    "Adaboost Train",
    "Adaboost Validation",
    "GB Train",
    "GB Validation",
    "XGB Train",
    "XGB Validation",
]
print("Training X Validation performance on Random Search:")
models_train_comp_df
Training X Validation performance on Random Search:
| Adaboost Train | Adaboost Validation | GB Train | GB Validation | XGB Train | XGB Validation | |
|---|---|---|---|---|---|---|
| Accuracy | 0.986 | 0.937 | 0.999 | 0.944 | 1.000 | 0.972 |
| Recall | 0.995 | 0.951 | 1.000 | 0.954 | 1.000 | 0.908 |
| Precision | 0.977 | 0.735 | 0.998 | 0.759 | 1.000 | 0.916 |
| F1 | 0.986 | 0.829 | 0.999 | 0.845 | 1.000 | 0.912 |
# Calculating different metrics on the TEST set for the chosen GB model.
# Fix: stored in GB_test instead of overwriting GB_val, which holds the
# validation metrics (the original comment also wrongly said "validation set").
GB_test = model_performance_classification_sklearn(GB_best, X_test, y_test)
print("Test dataset performance:")
GB_test
Test dataset performance:
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 0.941 | 0.960 | 0.746 | 0.840 |
# creating confusion matrix on the TEST set.
# Bug fix: this previously plotted the matrix for X_val/y_val even though the
# preceding cell reports test-set performance — the plot and the numbers did
# not describe the same data.
confusion_matrix_sklearn(GB_best, X_test, y_test)
# Horizontal bar chart of GB_best feature importances, most important at the top
feature_names = X_train.columns
importances = GB_best.feature_importances_
order = np.argsort(importances)  # ascending, so the largest bar plots last (top)

plt.figure(figsize=(12, 12))
plt.title("Feature Importances")
plt.barh(range(len(order)), importances[order], color="violet", align="center")
plt.yticks(range(len(order)), [feature_names[i] for i in order])
plt.xlabel("Relative Importance")
plt.show()
For categorical columns, we will do one hot encoding and missing value imputation as pre-processing
We are doing missing-value imputation for the whole data set, so that any missing values appearing in future data can be handled automatically.
# List of Categorical Variables (includes discrete counts such as
# Dependent_count that are treated as categorical for one-hot encoding)
cat_col
['Gender', 'Education_Level', 'Marital_Status', 'Income_Category', 'Card_Category', 'Dependent_count', 'Total_Relationship_Count', 'Months_Inactive_12_mon', 'Contacts_Count_12_mon']
# Numerical variables: every column that is neither categorical nor the target
num_col = [col for col in data1.columns if col not in cat_col]
num_col.remove("Attrition_Flag")  # the target is not a predictor
num_col
['Customer_Age', 'Months_on_book', 'Credit_Limit', 'Total_Revolving_Bal', 'Total_Amt_Chng_Q4_Q1', 'Total_Trans_Amt', 'Total_Trans_Ct', 'Total_Ct_Chng_Q4_Q1', 'Avg_Utilization_Ratio']
# List of numerical variables: impute missing values with the column median
numerical_features = num_col
numeric_transformer = Pipeline(steps=[("imputer", SimpleImputer(strategy="median"))])

# List of categorical variables: impute with the most frequent level, then
# one-hot encode. handle_unknown="ignore" lets the model cope with category
# levels that appear only in future/test data.
categorical_features = cat_col
categorical_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("onehot", OneHotEncoder(handle_unknown="ignore")),
    ]
)

# Combine both transformers; remainder="passthrough" leaves any column not
# listed in numerical_features/categorical_features unchanged.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numerical_features),
        ("cat", categorical_transformer, categorical_features),
    ],
    remainder="passthrough",
)
# remainder = "passthrough" has been used, it will allow variables that are present in original data
# but not in "numerical_columns" and "categorical_columns" to pass through the column transformer without any changes
# Separate predictors and target; the best model is already chosen, so a
# simple train/test split (no separate validation set) is enough here.
X = data1.drop("Attrition_Flag", axis=1)
Y = data1["Attrition_Flag"]

# Stratified 70/30 split keeps the class ratio identical in both sets
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.30, random_state=1, stratify=Y
)
print(X_train.shape, X_test.shape)
(7088, 18) (3039, 18)
### Undersampling train data
# Rebalance the new training split the same way as before (random undersampling),
# since the chosen model was tuned on undersampled data.
rus = RandomUnderSampler(random_state=1)
X_train_under, y_train_under = rus.fit_resample(X_train, y_train)
# Tuned Gradient Boosting classifier (best parameters from the randomized search)
gboost_best = GradientBoostingClassifier(
    random_state=1,
    n_estimators=175,
    subsample=1,
    max_features=0.8,
    learning_rate=0.2,
)

# Final pipeline: preprocessing (imputation + one-hot encoding) then the model
model = Pipeline(steps=[("pre", preprocessor), ("GBoost", gboost_best)])

# Fit the whole pipeline on the undersampled training data
model.fit(X_train_under, y_train_under)
Pipeline(steps=[('pre',
ColumnTransformer(remainder='passthrough',
transformers=[('num',
Pipeline(steps=[('imputer',
SimpleImputer(strategy='median'))]),
['Customer_Age',
'Months_on_book',
'Credit_Limit',
'Total_Revolving_Bal',
'Total_Amt_Chng_Q4_Q1',
'Total_Trans_Amt',
'Total_Trans_Ct',
'Total_Ct_Chng_Q4_Q1',
'Avg_Utilization_Ratio']),
('cat',
Pipeline(steps=[('...
SimpleImputer(strategy='most_frequent')),
('onehot',
OneHotEncoder(handle_unknown='ignore'))]),
['Gender', 'Education_Level',
'Marital_Status',
'Income_Category',
'Card_Category',
'Dependent_count',
'Total_Relationship_Count',
'Months_Inactive_12_mon',
'Contacts_Count_12_mon'])])),
('GBoost',
GradientBoostingClassifier(learning_rate=0.2, max_features=0.8,
n_estimators=175, random_state=1,
subsample=1))])
# Calculating different metrics on Test set for the full preprocessing+model pipeline
GB_val_pipe = model_performance_classification_sklearn(model, X_test, y_test)
print("Pipeline - Test dataset performance:")
GB_val_pipe
Pipeline - Test dataset performance:
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 0.950 | 0.961 | 0.779 | 0.861 |